{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/summarization/dailymail/translated-dailymail-train.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/summarization/cnn-news/translated-cnn-train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "import sentencepiece as spm\n",
    "from glob import glob\n",
    "import json\n",
    "import os\n",
    "\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "vocab = 'sp10m.cased.ms-en.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "sp.Load(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    string = string.replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['/home/husein/news/translated-dailymail-train.json',\n",
    "        '/home/husein/news/translated-cnn-train.json']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(files[1]) as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Lebih 200 kilogram coklat dimakan di Cafe Fleuri\\'s buffet Chocolate Bar setiap Sabtu. Coklat dipaparkan dalam setiap kursus makan malam dan sarapan pagi untuk pakej indulgensi coklat Three Way House Hotel. The Hotel Hershey of Hersheypark menjadi tuan rumah kepada pemburuan telur Easter tahunan yang dikatakan \"legenda\".'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "' '.join(data[0]['ms_abstract'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/husein/news/translated-dailymail-train.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 209506/209506 [01:08<00:00, 3061.87it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/husein/news/translated-cnn-train.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 90579/90579 [00:28<00:00, 3153.04it/s]\n"
     ]
    }
   ],
   "source": [
    "with tf.io.gfile.GFile('summarization.tsv', \"w\") as outfile:\n",
    "    for file in files:\n",
    "        print(file)\n",
    "        with open(file) as fopen:\n",
    "            data = json.load(fopen)\n",
    "        for i in tqdm(range(len(data))):\n",
    "            l = ' '.join(data[i]['ms_article'])\n",
    "            r = ' '.join(data[i]['ms_abstract'])\n",
    "            outfile.write(\"%s\\t%s\\n\" % (cleaning(l), cleaning(r)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarization_dataset(split, shuffle_files = False):\n",
    "    del shuffle_files\n",
    "    ds = tf.data.TextLineDataset(\n",
    "        [\n",
    "            'summarization.tsv'\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    ds = ds.map(\n",
    "        functools.partial(\n",
    "            tf.io.decode_csv,\n",
    "            record_defaults = ['', ''],\n",
    "            field_delim = '\\t',\n",
    "            use_quote_delim = False,\n",
    "        ),\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )\n",
    "    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))\n",
    "    return ds\n",
    "\n",
    "def summarization_preprocessor(ds):\n",
    "    def to_inputs_and_targets(ex):\n",
    "        return {\n",
    "            'inputs': tf.strings.join(['ringkasan: ', ex['question']]),\n",
    "            'targets': ex['answer'],\n",
    "        }\n",
    "\n",
    "    return ds.map(\n",
    "        to_inputs_and_targets,\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "t5.data.TaskRegistry.remove('summarization_dataset')\n",
    "t5.data.TaskRegistry.add(\n",
    "    'summarization_dataset',\n",
    "    dataset_fn = summarization_dataset,\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = [summarization_preprocessor],\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [t5.evaluation.metrics.accuracy],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"summarization_dataset\")\n",
    "ds = nq_task.get_dataset(split='summarization.tsv', sequence_length={\"inputs\": 1024, \"targets\": 1024})\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'inputs_plaintext': b\"ringkasan: Oleh . Bukit Suzannah. PENERBITAN: . 02:56 EST, 18 September 2013. |. DIKEMASKINI: . 04:19 EST, 18 September 2013. Penjara: Robert McLaughlin, 52, dipenjara 16 minggu selepas tersepit di belakang roda lima kali melebihi had dan berpakaian sebagai WPC. Seorang pemandu minuman yang ditemui tergelincir di belakang roda ketika berpakaian seragam pegawai polis wanita telah dipenjarakan. Robert McLaughlin, 52, was five times the drink-drive limit when he was discovered at the wheel of a car wearing a female police top, black trousers and boots, a court heard. Dia juga mempunyai alat dengar komunikasi, dengan wayar masuk ke kolarnya, dan topi keledar polis yang menjadi milik anak tirinya di tempat duduk penumpang. Peguamnya memberitahu mahkamah bahawa dia telah menjalin hubungan dengan pemilik pakaian seragam itu dan mereka akan 'sesekali berpakaian' dalam gearnya. Majistret Wakefield mendengar seorang penunggang basikal hampir dirobohkan oleh sebuah kereta yang cuba mengelak McLaughlin's Citroen, yang diparkir secara menyerong di seberang jalan. Penunggang basikal itu pergi untuk berhadapan dengan McLaughlin dan mendapati dia 'tersungkur dan kelihatan memakai pakaian seragam polis'. Richard Ogden, pendakwa raya, berkata penunggang basikal itu menyifatkan McLaughlin yang menganggur sebagai 'seolah-olah terkeluar' dan 'benar-benar keliru' apabila dia melihatnya di jalan raya di Ackworth, West Yorkshire, kira-kira jam 1.50 tengah hari pada 28 Mei. Percaya pemandu minuman itu menjadi pegawai polis yang 'mungkin mengalami strok', penunggang basikal itu memanggil ambulans. Ujian nafas tepi jalan kemudian mendedahkan bahawa McLaughlin lima kali melebihi had pemandu minuman. Dia ditangkap, dan kemudian dibawa ke hospital kerana jumlah alkohol yang tinggi dalam sistemnya. Ujian darah menunjukkan dia mempunyai 366mg alkohol dalam 100mls darah. Had undang-undang adalah 80mgs. Ogden mendedahkan artikel pakaian seragam milik konstabel wanita dengan aduan perubatan yang melemahkan, yang McLaughlin berteman dan membantu di sekitar rumah. Akhirnya dia memberinya kunci. 'Pegawai yang barang-barangnya mereka dikatakan sangat tertekan untuk mencari barang-barang ini telah hilang, '' kata Encik Ogden. Ban: Majistret Wakefield memenjarakan McLaughlin selama 16 minggu dan melarangnya memandu selama empat tahun. Fleece McLaughlin, dari Wakefield, ditangkap memakai mempamerkan pegawai dalam butiran soalan. Sejak kejadian dia menukar namanya, mahkamah mendengar. Anak tirinya juga seorang polis dan mahkamah diberitahu bahawa topi keledar yang ditemui di tempat duduk itu adalah satu yang lama dia telah rosak dalam latihan. Ogden berkata, sembilan lagi barangan peralatan dan pakaian polis yang dikeluarkan ditemui ketika mereka menggeledah kediaman McLaughlin. Ruth Gill, mitigating, memberitahu mahkamah bahawa daripada hanya berteman dengan pegawai, McLaughlin telah menjalin hubungan dengannya selama 18 bulan. Mrs Gill berkata: 'Semasa tempoh itu mereka digunakan untuk sesekali berpakaian dalam barangan polis yang kemudiannya ditemui. ' Pada pendengaran awal, McLaughlin mengakui minum dan memandu dan memiliki pakaian seragam polis ketika tidak menjadi anggota pasukan polis atau konstabel khas. Disebabkan tahap alkohol yang tinggi dalam sistemnya McLaughlin dipenjarakan selama 16 minggu dan diharamkan memandu selama empat tahun.\",\n",
       " 'inputs': array([ 7680,    47,    31,   857,    13,     3,  2444,  1327,  7349,\n",
       "         6452,     3, 12257,  4056,  7481,  9717,    31,    13,     3,\n",
       "        16011,  5788,  4334,    14,   375,   561,   646,     3,  2281,\n",
       "            3,  3479, 31923,    31,    13,     3, 18235,   756,  4334,\n",
       "           14,   375,   561,   646,     3, 18102,    31,  1853, 29843,\n",
       "           14,  5310,    14, 17855,   669,   376,   428, 11225,  5237,\n",
       "           24,   941,  7125,   741,   349,  2995,   102,    22,  6717,\n",
       "           85,   602, 12909,     3,  1064,  2720,  4276,    17,  2822,\n",
       "        19440,    24,   941,  7125,   123,  6717, 12359,   551,   612,\n",
       "          302,    62, 21995,     3,  1853, 29843,    14,  5310,    14,\n",
       "           39,   760,  1221,    15,  7018,     7, 22436,  5864,   146,\n",
       "           57,    39,  5645,    48,    15, 14480,    18,    21,   905,\n",
       "         5331,    21,  3278,   559,   853,    14,  1564,    13,    41,\n",
       "        14918,   287,    20, 22468,    14,    21,  1233,  2702,     3,\n",
       "          160,    93,   118,  1829,  8319,  5181,    14,    28, 10540,\n",
       "          605,    55, 29092,    38,    14,    22,  8302, 20886,   612,\n",
       "           17,    95,  2548,   270, 18811,    38,    24,   234,  1625,\n",
       "         3296,     3,  3390,    38,   614,  1624,    56,    61,    62,\n",
       "        10314,   958,    28,  2419,  1939, 12359,    37,    22,    46,\n",
       "           45,    13,    12,    16,  5445,  5809,  6717,    12,    36,\n",
       "         8461,    38,     3, 22334, 21353,  2150,  1605,   163, 14249,\n",
       "         8127,   473, 27786,    60,   136,   539,    17,  1666, 12570,\n",
       "        29843,    12,    16,  2567,  3125,   258,    14,    17, 27518,\n",
       "          156,  6843,  5519,    24,  7473,   489,     3,  1794,  1125,\n",
       "         6175,  8127,    37,   657,    25,  9527,    28, 29843,    22,\n",
       "         1608,    61,    13,    12,  5359,  1125,  7252,    22,  1043,\n",
       "         2528,  1939, 12359,   612,    12,     3,  2199,   448,   353,\n",
       "         1416,    14,  7334,   672,    14,   291, 14249,  8127,    37,\n",
       "        12898, 29843,    17, 24482,    85,    13,    12,   478,  3879,\n",
       "            7,  3879,   987, 25641,    12,    22,    13,    12,   589,\n",
       "            7,   589, 18965,    12,   721,    61,  3792,    24,   489,\n",
       "          672,    24,    76,  1478,  7311,    14,   825, 25755,    14,\n",
       "          919,     7,   618,   576,   179,     3,  1307,  1073,    89,\n",
       "           33,  1506,  1019,     3,    13, 27365,  2720,  4276,    37,\n",
       "           95,   551,   612,    17,    13,    12,  8647,  1037, 23626,\n",
       "           12,    14, 14249,  8127,    37,  2640, 19663,     3, 17102,\n",
       "        12429,  6499,   489,   365,  3692,    56, 29843,   741,   349,\n",
       "         2995,   102,  2720,  4276,     3,   160,  3164,    14,    22,\n",
       "          365,  2987,    55,  1042,   122,   522,  8139,    17,   329,\n",
       "           36,   671,    38,     3, 17102,  3319,   308,    61,   118,\n",
       "          223,  5370,   117,   353,  8139,    36,   865,   117,  8222,\n",
       "         3319,     3,  8363,   500,     7,   304,    52,  2547,   117,\n",
       "         9618,     3,   448,   353,  1416,  3692,  4130,  1939, 12359,\n",
       "         2548,  6933,  3997,  3534,   302,    28,  6167,  2070,    17,\n",
       "        13014,    14,    17, 29843, 28524,    22,   470,    24,   462,\n",
       "          233,     3,  7720,    61, 12506,  5260,     3,    13,    12,\n",
       "         4072, 20543,    91,    17,  1724,     7,  5068,    38,    46,\n",
       "         2041,   176, 12684,    25,   627,  1724,     7,  5068,    34,\n",
       "           62,  1465,    14,    13,    12,    12,    98,  4794,   448,\n",
       "          353,  1416,     3,  3540,    31, 22334, 21353,  2150,   227,\n",
       "          829,  1108,  3293, 29843,   296,   669,   376,    22,  5263,\n",
       "           38,  4117,   296,   662,    53,     3, 16996,    81,   841,\n",
       "        29843,    14,    42, 21353,  2150,    14,  3164,  2528, 14443,\n",
       "          551,    36, 11988,  2762,     3,  3216,  1798,    61,  5439,\n",
       "         4516,    14,  1624,  1605,     3,  3852, 18811,    38,    93,\n",
       "          163,   612,    22,  1624,  4405,    56,  8302, 20886,    17,\n",
       "         2822,    24,   234,  1625,    37,    52,   100,    17,   461,\n",
       "           61,    62,  8842,    36,  1855,     3,   448,   353,  1416,\n",
       "          291,    14,  3464,   221,  8599,  4380,    22,  1939,   612,\n",
       "           17,  1572,  2822,   123,    46, 30254,  4003, 29843,     3,\n",
       "        16132, 11336,    14,    13,  1676,  4317,   766,    14,   614,\n",
       "         1624,    56,   109,   169, 28524,    28,   551,    14, 29843,\n",
       "           62, 10314,   958,  4515,   296,   375,   204,     3,  6389,\n",
       "        11336,   291,    31,    13,    12, 25861,  1533,    37,    46,\n",
       "          570,    25, 19102,  6717,    36,  8599,   612,    17,  5718,\n",
       "         2822,     3,    13,    12,   206, 14185,   404,    14, 29843,\n",
       "         2290,  5258,    22,  4117,    22,   505,  1939, 12359,   612,\n",
       "          123,    30,    95,   717,   467,   612,    87,  6933,  3997,\n",
       "         3534,  1769,     3, 23384,  1074,  8139,    17,   329,    36,\n",
       "          671,    38, 29843, 21995,   296,   669,   376,    22, 30930,\n",
       "         4117,   296,   662,    53,     3,     1]),\n",
       " 'targets_plaintext': b'Robert McLaughlin, 52, ditemui di belakang roda berpakaian seperti WPC. Ujian mendedahkan dia lima kali melebihi had pemacu minuman sah. McLaughlin dipenjarakan selama 16 minggu dan diharamkan memandu selama empat tahun.',\n",
       " 'targets': array([ 1853, 29843,    14,  5310,    14,  2822,    24,   941,  7125,\n",
       "         6717,   105,   602, 12909,     3, 17102,  3692,    61,   741,\n",
       "          349,  2995,   102, 18222,  4276,  2803,     3, 29843, 21995,\n",
       "          296,   669,   376,    22, 30930,  4117,   296,   662,    53,\n",
       "            3,     1])}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
